In [1]:
import re

In [6]:
# Regex for one table cell of the sample page, of the form
#   <td>NAME &lt;ADDRESS&gt;</td>
# group(1) is the name (non-greedy); group(2) is the obfuscated address, which
# must contain the literal markers "(at)" and "(dot)", in that order.
regex = re.compile(r'<td>(.+?) &lt\;(.+\(at\).+\(dot\).+)&gt\;</td>')

In [11]:
# Read the sample page into memory and close the handle straight away.
# `file` ends up as a list of lines, so the extraction loop in the next cell
# can iterate it (and be re-run) without depending on a half-consumed,
# never-closed file object. The name `file` is kept because that cell reads it.
with open('email_sample.html','r') as source:
    file = source.readlines()




In [12]:
# Collect a (name, obfuscated address) pair from every line matched by `regex`.
email = []
for line in file:
    m = regex.search(line)
    # search() returns None for lines that do not match; skip those explicitly
    # rather than hiding every possible error behind a bare `except`.
    if m is not None:
        email.append((m.group(1), m.group(2)))

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Total Email Extracted:  %d' % len(email))
# Only show a sample when something was extracted (avoids IndexError on []).
if email:
    print('sample Email : %s' % (email[0],))


Total Email Extracted:  88262
sample Email : ('Siddhartha Roy', 'siddhartha.r85(at)gmail(dot)com')

In [8]:
# Show the second extracted (name, obfuscated address) pair.
email[1]


Out[8]:
('Akash Rathi', 'akrathi9945(at)gmail(dot)com')

Replacing the (dot) and (at) placeholders with "." and "@"


In [17]:
# Quick check of the substitution on the first extracted record.
# Each pattern matches one literal placeholder used to obfuscate addresses.
replaceDot = re.compile(r'\(dot\)')
replaceAt = re.compile(r'\(at\)')

# "(dot)" is replaced first, then "(at)"; print(...) with a single argument
# produces the same output on Python 2 and Python 3.
print(replaceAt.sub('@', replaceDot.sub('.', email[0][1])))


siddhartha.r85@gmail.com

In [21]:
# Patterns for the two literal placeholders used to obfuscate addresses.
replaceDot = re.compile(r'\(dot\)')
replaceAt = re.compile(r'\(at\)')

# Rebuild every record as (name, real address): "(dot)" becomes "." and
# "(at)" becomes "@". The name is carried over unchanged.
PureEmail_data = [
    (name, replaceAt.sub('@', replaceDot.sub('.', address)))
    for name, address in email
]

# Single-argument print(...) emits the same text on Python 2 and Python 3;
# the guard avoids an IndexError when no record was extracted.
if PureEmail_data:
    print("sample result : %s" % (PureEmail_data[0],))


sample result : ('Siddhartha Roy', 'siddhartha.r85@gmail.com')

In [23]:
# Report how many addresses were converted; single-argument print(...) gives
# the same output on Python 2 and Python 3.
print('Total Harvested Email : %d' % len(PureEmail_data))
# Drop the intermediate list of obfuscated records. Cells above that read
# `email` must have the extraction cell re-run before they work again.
del email


Total Harvested Email : 88262

In [ ]: